rm(list=ls())

Load requisite packages and data

library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)

This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:

Original Variable Name New Variable Name Definition
State State state
County County_Name County
FIPS FIPS_County_Code 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity)
Year Year Year when the data was collected
Tier Affordability Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all
Median.Sale.Price..in.Thousands. Median_Sale_Price_in_k Median sale price in thousands of USD per county
House.Price.Appreciation.since.2012 House_Price_Appreciation_since_2012_percent Cumulative home price appreciation since 2012
House.Price.Appreciation..Year.over.Year House_Price_Appreciation_yr_over_yr_percent Home price appreciation since the previous year
Months..Supply Months_Supply Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace
New.Construction.Share.of.Sales New_Constr_by_share_of_sales_percent Percent of sales comprising new construction
Mortgage.Default.Rate Mortgage_Default_Rate_percent AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk)
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
##   State        County  FIPS Year       Tier Median.Sale.Price..in.Thousands.
## 1    GA Jasper County 13159 2023        all                            $234 
## 2    NC   Hyde County 37095 2017        all                            $157 
## 3    ID Canyon County 16027 2023        all                            $394 
## 4    PA Mckean County 42083 2019        all                             $65 
## 5    TX   Ward County 48475 2018 entrylevel                            $108 
##   House.Price.Appreciation.since.2012 House.Price.Appreciation..Year.over.Year.
## 1                             188.40%                                     2.10%
## 2                              34.30%                                    14.80%
## 3                             253.00%                                     1.90%
## 4                              19.50%                                     6.00%
## 5                              39.80%                                     9.20%
##   Months..Supply New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1            4.7                          21.40%                19.40%
## 2            6.5                           3.30%                 9.00%
## 3            5.5                          36.10%                16.20%
## 4            6.4                           3.10%                17.70%
## 5            1.4                           4.50%                25.50%

The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.

housing_2024 = housing %>% filter(housing$Year == 2024, 
                                  housing$State != 'AA National', 
                                  housing$County != 'AA State') %>% na.omit %>% 
  #excluding the armed forces


#rename cols
rename(
  Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
  House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
  House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
  Months_Supply = Months..Supply,
  New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
  Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
  County_Name = County,
  FIPS_County_Code = FIPS, 
  Affordability = Tier
)
head(housing_2024)
##   State                County_Name FIPS_County_Code Year Affordability
## 4    AK Aleutians West Census Area             2016 2024           all
## 5    AK Aleutians West Census Area             2016 2024    entrylevel
## 6    AK Aleutians West Census Area             2016 2024        moveup
## 7    AK     Anchorage Municipality             2020 2024           all
## 8    AK     Anchorage Municipality             2020 2024    entrylevel
## 9    AK     Anchorage Municipality             2020 2024        moveup
##   Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 4                                                                    
## 5                                                                    
## 6                                                                    
## 7                   $415                                       51.90%
## 8                   $325                                       49.70%
## 9                   $581                                       54.20%
##   House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 4                                                       1.8
## 5                                                       2.8
## 6                                                       1.6
## 7                                       4.50%           2.5
## 8                                       4.00%           2.2
## 9                                       5.00%           2.8
##   New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 4                                                                   
## 5                                                                   
## 6                                                                   
## 7                                4.40%                        12.40%
## 8                                1.90%                        13.60%
## 9                                7.90%                         9.80%

The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.

# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)

# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
  mutate(Median_Sale_Price_per_k = gsub("\\$", "", Median_Sale_Price_per_k),
         House_Price_Appreciation_since_2012_percent =
           gsub("%","",House_Price_Appreciation_since_2012_percent),
         House_Price_Appreciation_yr_over_yr_percent =
           gsub("%","",House_Price_Appreciation_yr_over_yr_percent),
         New_Constr_by_share_of_sales_percent = gsub("%","",New_Constr_by_share_of_sales_percent),
         Mortgage_Default_Rate_percent = gsub("%","",Mortgage_Default_Rate_percent)
  )

# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
  as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
  as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent = 
  as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)

# For some reason is rounding the data in quite a weird way—inaccurately

# view data
housing_2024 %>% slice_sample(n=5)
##   State     County_Name FIPS_County_Code Year Affordability
## 1    KS   Seward County            20175 2024    entrylevel
## 2    AR    Clark County             5019 2024        moveup
## 3    IL   Putnam County            17155 2024           all
## 4    KY Nicholas County            21181 2024    entrylevel
## 5    IL     Cass County            17017 2024        moveup
##   Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1                      NA                                          NA
## 2                     325                                        58.9
## 3                      NA                                          NA
## 4                      94                                        98.4
## 5                     233                                          NA
##   House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1                                          NA           3.4
## 2                                         2.3           7.9
## 3                                          NA           4.7
## 4                                        -1.5           1.9
## 5                                          NA           3.3
##   New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1                                   NA                            NA
## 2                                 10.3                          12.9
## 3                                   NA                            NA
## 4                                  0.0                            NA
## 5                                  0.0                            NA
#creating df w/ affordability all totals and then another with entry level/moving up totals
housing_2024_all = housing_2024 %>% filter(housing_2024$Affordability == "all") 
housing_2024_tiers = housing_2024 %>% filter(Affordability == "entrylevel" | Affordability == "moveup")

Exploratory Data Analysis (EDA)

xkablesummary(housing_2024)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 730 Washington County: 87 1001 : 3 Min. :2024 all :3046 Min. : 14 Min. : 3.7 Min. :-32.90 Min. : 0.00 Min. : 0.0 Min. : 0.2
Q1 GA : 477 Jefferson County : 74 1003 : 3 1st Qu.:2024 entrylevel:3043 1st Qu.:148 1st Qu.: 86.6 1st Qu.: 2.80 1st Qu.: 2.20 1st Qu.: 2.7 1st Qu.:10.2
Median KY : 360 Franklin County : 72 1005 : 3 Median :2024 moveup :2996 Median :255 Median :105.2 Median : 5.70 Median : 3.20 Median : 6.8 Median :13.9
Mean MO : 334 Jackson County : 68 1007 : 3 Mean :2024 NA Mean :284 Mean :110.7 Mean : 5.88 Mean : 4.37 Mean : 10.7 Mean :14.3
Q3 VA : 323 Lincoln County : 66 1009 : 3 3rd Qu.:2024 NA 3rd Qu.:389 3rd Qu.:129.6 3rd Qu.: 8.70 3rd Qu.: 5.20 3rd Qu.: 14.9 3rd Qu.:17.5
Max IL : 304 Madison County : 57 1011 : 3 Max. :2024 NA Max. :999 Max. :279.1 Max. : 89.20 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):6557 (Other) :8661 (Other):9067 NA NA NA’s :990 NA’s :2177 NA’s :2204 NA NA’s :859 NA’s :1960
#separated df's
xkablesummary(housing_2024_all)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 249 Washington County: 29 1001 : 1 Min. :2024 all :3046 Min. : 17 Min. : 3.7 Min. :-28.70 Min. : 0.10 Min. : 0.0 Min. : 0.5
Q1 GA : 159 Jefferson County : 25 1003 : 1 1st Qu.:2024 entrylevel: 0 1st Qu.:140 1st Qu.: 88.9 1st Qu.: 3.00 1st Qu.: 2.30 1st Qu.: 2.9 1st Qu.:11.4
Median KY : 120 Franklin County : 24 1005 : 1 Median :2024 moveup : 0 Median :199 Median :107.3 Median : 5.95 Median : 3.10 Median : 6.4 Median :14.5
Mean MO : 112 Jackson County : 23 1007 : 1 Mean :2024 NA Mean :236 Mean :113.4 Mean : 6.00 Mean : 3.63 Mean : 9.5 Mean :14.8
Q3 VA : 108 Lincoln County : 22 1009 : 1 3rd Qu.:2024 NA 3rd Qu.:300 3rd Qu.:132.6 3rd Qu.: 8.60 3rd Qu.: 4.30 3rd Qu.: 12.5 3rd Qu.:17.6
Max IL : 102 Madison County : 19 1011 : 1 Max. :2024 NA Max. :998 Max. :278.0 Max. : 89.20 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):2196 (Other) :2904 (Other):3040 NA NA NA’s :309 NA’s :588 NA’s :596 NA NA’s :276 NA’s :618
xkablesummary(housing_2024_tiers)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 481 Washington County: 58 1001 : 2 Min. :2024 all : 0 Min. : 14 Min. : 4.5 Min. :-32.90 Min. : 0.00 Min. : 0.0 Min. : 0.2
Q1 GA : 318 Jefferson County : 49 1003 : 2 1st Qu.:2024 entrylevel:3043 1st Qu.:153 1st Qu.: 85.4 1st Qu.: 2.70 1st Qu.: 2.20 1st Qu.: 2.5 1st Qu.: 9.7
Median KY : 240 Franklin County : 48 1005 : 2 Median :2024 moveup :2996 Median :310 Median :104.4 Median : 5.60 Median : 3.40 Median : 7.2 Median :13.6
Mean MO : 222 Jackson County : 45 1007 : 2 Mean :2024 NA Mean :308 Mean :109.2 Mean : 5.82 Mean : 4.74 Mean : 11.3 Mean :14.0
Q3 VA : 215 Lincoln County : 44 1009 : 2 3rd Qu.:2024 NA 3rd Qu.:415 3rd Qu.:128.3 3rd Qu.: 8.70 3rd Qu.: 5.90 3rd Qu.: 16.0 3rd Qu.:17.5
Max IL : 202 Madison County : 38 1011 : 2 Max. :2024 NA Max. :999 Max. :279.1 Max. : 86.80 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):4361 (Other) :5757 (Other):6027 NA NA NA’s :681 NA’s :1589 NA’s :1608 NA NA’s :583 NA’s :1342

boxplot of median sale price by state

ggplot(housing_2024, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                         y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Median Sale Prices by State (2024)",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.

#boxplotting med sale price by state w/ ..._all df
#all
ggplot(housing_2024_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                         y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Median Sale Prices by State (2024)",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

boxplot of top and bottom 5 states by housing count

top_states <- housing_2024 %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_max(n, n = 5)

bottom_states <- housing_2024 %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_min(n, n = 5)

# merge top and bottom states
housing_compare <- housing_2024 %>%
  filter(State %in% c(top_states$State, bottom_states$State)) %>%
  mutate(StateGroup = case_when(
    State %in% top_states$State ~ "States with the Most Houses",
    State %in% bottom_states$State ~ "States with the Fewest Houses"
  ))%>%
  mutate(StateGroup = factor(StateGroup, levels = c(
    "States with the Most Houses",
    "States with the Fewest Houses"
  )))

#plot top bottom comparison ####
ggplot(housing_compare, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
  geom_boxplot() +
  facet_wrap(~ StateGroup, scales = "free_x") +
  labs(
    title = "Median Sale Price in States with Most vs Least Housing Records",
    subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))

#boxplotting with new df: ..._all
top_states_all <- housing_2024_all %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_max(n, n = 5)

bottom_states_all <- housing_2024_all %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_min(n, n = 5)

# merge top and bottom states
housing_compare_all <- housing_2024_all %>%
  filter(State %in% c(top_states_all$State, bottom_states_all$State)) %>%
  mutate(StateGroup = case_when(
    State %in% top_states_all$State ~ "States with the Most Houses",
    State %in% bottom_states_all$State ~ "States with the Fewest Houses"
  ))%>%
  mutate(StateGroup = factor(StateGroup, levels = c(
    "States with the Most Houses",
    "States with the Fewest Houses"
  )))

#plot top bottom comparison ####
ggplot(housing_compare_all, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
  geom_boxplot() +
  facet_wrap(~ StateGroup, scales = "free_x") +
  labs(
    title = "Median Sale Price in States with Most vs Least Housing Records",
    subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))

hist of median sale price

ggplot(housing_2024, aes(x = Median_Sale_Price_per_k)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (2024)",
    x = "Median Sale Price (in thousands)",
    y = "Count"
  ) +
  theme_minimal()

#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
#hist plotting with df ..._all
ggplot(housing_2024_all, aes(x = Median_Sale_Price_per_k)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (2024)",
    x = "Median Sale Price (in thousands)",
    y = "Count"
  ) +
  theme_minimal()

#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me

boxplot of sale price by affordability tier

ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Affordability Tier (2024)",
    x = "Affordability Tier",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

#no duh, there are more unpurchased expensive houses because people can't afford it 
#not sure how useful this is, but maybe as a starting baseline 

scatterplot of months supply vs median price

ggplot(housing_2024_all, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
  geom_point(color="steelblue", alpha = 0.7) +
  # geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Housing Supply vs Median Sale Price",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

#this is a good one, needs a commentary subtitle and some cleaning

correlation heatmap of numeric variables

housing_numeric <- housing_2024 %>%
  select(where(is.numeric) & !all_of("Year")) %>%
  drop_na()

cor_matrix <- cor(housing_numeric)

corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)

#interesting to see positive and inverse relationships between variables 
#did not scan for no relationships
#correlation heatmap of numeric variables w/ ..._all df
housing_numeric_all <- housing_2024_all %>%
  select(where(is.numeric) & !all_of("Year")) %>%
  drop_na()

cor_matrix_all <- cor(housing_numeric_all)

corrplot(cor_matrix_all, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)

compare color scale

# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"]))))
names(top_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"])

bottom_states_colors <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"])

#last point in line for state label 
label_points <- housing_compare %>%
  group_by(State) %>%
  filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
  ungroup()

# Combine into one color vector
state_colors <- c(top_states_colors, bottom_states_colors)

#all in gray with faceted compare in color with states labeled ####
ggplot() +
  geom_point(data = housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare,
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           size = 1, alpha = 0.8) +
  geom_point(data = housing_compare,
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             size = 2, alpha = 0.5) +
  geom_text_repel(data = label_points,
                  aes(x = Months_Supply, y = Median_Sale_Price_per_k, 
                      label = State, color=State),
                  size = 3.5, stroke=0.01, show.legend = FALSE) +
  facet_wrap(~ StateGroup) +
  scale_color_manual(values = state_colors) +
  labs(
    title = "Housing Supply vs Median Price by State with Grouped Colors",
    subtitle = "Some commentary here",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

#can't read state names, maybe find a way to make it stand out 
# compare color scale w/ ..._all df
# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors_all <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Most Houses"]))))
names(top_states_colors_all) <- unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Most Houses"])

bottom_states_colors_all <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors_all) <- unique(housing_compare_all$State[housing_compare_all$StateGroup == "States with the Fewest Houses"])

#last point in line for state label 
label_points_all <- housing_compare_all %>%
  group_by(State) %>%
  filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
  ungroup()

# Combine into one color vector
state_colors_all <- c(top_states_colors_all, bottom_states_colors_all)

#all in gray with faceted compare in color with states labeled ####
ggplot() +
  geom_point(data = housing_2024_all, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare,
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           size = 1, alpha = 0.8) +
  geom_point(data = housing_compare_all,
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             size = 2, alpha = 0.5) +
  geom_text_repel(data = label_points_all,
                  aes(x = Months_Supply, y = Median_Sale_Price_per_k, 
                      label = State, color=State),
                  size = 3.5, stroke=0.01, show.legend = FALSE) +
  facet_wrap(~ StateGroup) +
  scale_color_manual(values = state_colors_all) +
  labs(
    title = "Housing Supply vs Median Price by State with Grouped Colors",
    subtitle = "Some commentary here",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

#can't read state names, maybe find a way to make it stand out 

both together in gray with compare in color with states labeled

ggplot() +
  geom_point(data = housing_2024, 
             aes(x = Months_Supply, y = Median_Sale_Price_per_k), 
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  scale_color_manual(values = state_colors) +
  labs(
    title = "Housing Supply vs Median Price: All Counties with Highlights",
    subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

#both together in gray with compare in color with states labeled w/ ..._all df
ggplot() +
  geom_point(data = housing_2024_all, 
             aes(x = Months_Supply, y = Median_Sale_Price_per_k), 
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Most Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Fewest Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  scale_color_manual(values = state_colors_all) +
  labs(
    title = "Housing Supply vs Median Price: All Counties with Highlights",
    subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

new construction vs median sale price

housing_constr = housing_2024
housing_constr = housing_constr %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
#new construction vs median sale price w/ ..._all df
housing_constr_all = housing_2024_all
housing_constr_all = housing_constr_all %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(housing_constr_all, aes(x = constr_bins, y = Median_Sale_Price_per_k, fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?

Limit to Entry-Level houses

housing_constr_tiers = housing_2024_tiers
housing_constr_tiers = housing_constr_tiers %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

# Even more pronounced here

## Entry-Level Focused Analysis

### Boxplot of Median Sale Price by State (Entry-Level Only)
housing_entrylevel <- housing_2024_tiers %>%
  filter(Affordability == "entrylevel")
housing_moveup <- housing_2024_tiers %>%
  filter(Affordability == "moveup")

ggplot(housing_entrylevel, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                               y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = "skyblue", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Median Sale Price by State (Entry-Level Homes, 2024)",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

Histogram of Median Sale Prices (Entry-Level vs. Moveup)

ggplot(housing_entrylevel, aes(x = Median_Sale_Price_per_k)) +
  geom_histogram(binwidth = 50, fill = "lightgreen", color = "black") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (Entry-Level Homes, 2024)",
    x = "Median Sale Price (in thousands)",
    y = "Number of Counties"
  ) +
  theme_minimal()

ggplot(housing_2024_tiers, aes(x = Median_Sale_Price_per_k, fill = Affordability)) +
  geom_histogram(
    binwidth = 50,
    color = "black",
    alpha = 0.5,
    position = "identity"
  ) +
  scale_fill_manual(
    values = c("entrylevel" = "lightgreen", "moveup" = "steelblue"),
    name = "Home Type",
    labels = c("Entry-Level", "Move-Up")
  ) +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (Entry-Level vs Move-Up, 2024)",
    x = "Median Sale Price (in thousands)",
    y = "Count"
  ) +
  theme_minimal()

Scatterplot: Months of Supply vs Price (Entry-Level)

ggplot(housing_entrylevel, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
  geom_point(color = "tomato", alpha = 0.6) +
  labs(
    title = "Months of Supply vs Median Sale Price (Entry-Level Homes, 2024)",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

Correlation Heatmap of Numeric Variables (Entry-Level)

housing_numeric <- housing_2024 %>%
  select(where(is.numeric), -Year) %>%
  drop_na()

cor_matrix <- cor(housing_numeric)

corrplot(cor_matrix, method = "color", type = "upper",
         tl.cex = 0.8, addCoef.col = "black", number.cex = 0.5)